
import scrapy
import json
import copy


class RegionCodeSpider(scrapy.Spider):
    """Crawl the 2021 region-code tables on stats.gov.cn.

    The crawl starts at the province index and follows the link found in
    each table row down through the city, county, town and village pages.
    Each callback copies the partially filled item it received through
    ``response.meta["item"]``, adds the ``<level>_id`` / ``<level>_name``
    keys for its own level, and then either follows the row's link to the
    next level or, when the row has no link, yields the item as it stands.
    """

    name = 'region_code'
    type_list = ['provincetr', 'citytr', 'countytr', 'towntr', 'villagetr']
    base_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/'
    start_urls = [
        'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/index.html']

    # (row class, item-key prefix, callback name) for every level below the
    # province index, in drill-down order.  The last entry is the leaf level.
    _LEVELS = (
        ('citytr', 'city', 'parse_city'),
        ('countytr', 'county', 'parse_county'),
        ('towntr', 'town', 'parse_town'),
        ('villagetr', 'village', 'parse_village'),
    )

    def parse(self, response):
        """Yield one request per province linked from the index page.

        ``province_id`` is the link target with ``.html`` removed and
        ``province_name`` is the link text.
        """
        for cell in response.xpath("//tr[@class='provincetr']/td"):
            href = cell.xpath("a/@href").extract_first()
            if href is None:
                # Cell without a link: nothing to identify or follow.
                continue
            d = {
                "province_id": href.replace(".html", ""),
                "province_name": cell.xpath("a/text()").extract_first(),
            }
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_city,
                meta={"item": d})

    def _parse_level(self, response, first_level):
        """Parse the rows of one drill-down page.

        ``first_level`` is the index into ``_LEVELS`` that the caller
        expects.  When the page has no rows of that class, the deeper
        levels are tried in order, so a page that skips a level is still
        parsed instead of being dropped.
        NOTE(review): presumably needed for cities whose link leads straight
        to a town table -- confirm against the live site.

        Yields items for rows without a link (and for all village rows),
        and requests for rows that link to a deeper page.
        """
        # A missing parent item would otherwise fail on the first key write.
        parent = response.meta.get("item") or {}

        for level in range(first_level, len(self._LEVELS)):
            row_class, prefix, _ = self._LEVELS[level]
            rows = response.xpath("//tr[@class='%s']" % row_class)
            if rows:
                break
        else:
            # No recognised rows at this or any deeper level.
            return

        id_key = prefix + "_id"
        name_key = prefix + "_name"

        if level == len(self._LEVELS) - 1:
            # Village rows are never linked; the name is read from the
            # third cell, not the second.
            for row in rows:
                d = copy.deepcopy(parent)
                d[id_key] = row.xpath("td[1]/text()").extract_first()
                d[name_key] = row.xpath("td[3]/text()").extract_first()
                yield d
            return

        next_callback = getattr(self, self._LEVELS[level + 1][2])
        for row in rows:
            d = copy.deepcopy(parent)
            code = row.xpath("td[1]/a/text()").extract_first()
            if code is None:
                # Unlinked row: the cells hold plain text and there is no
                # deeper page, so the item is complete.
                d[id_key] = row.xpath("td[1]/text()").extract_first()
                d[name_key] = row.xpath("td[2]/text()").extract_first()
                yield d
                continue

            d[id_key] = code
            d[name_key] = row.xpath("td[2]/a/text()").extract_first()
            href = row.xpath("td[1]/a/@href").extract_first()
            if href is None:
                # Anchor without a target: keep the item rather than
                # re-requesting the current page.
                yield d
                continue
            yield scrapy.Request(
                response.urljoin(href),
                callback=next_callback,
                meta={"item": d})

    def parse_city(self, response):
        """Parse a province page (``citytr`` rows)."""
        return self._parse_level(response, 0)

    def parse_county(self, response):
        """Parse a city page (``countytr`` rows)."""
        return self._parse_level(response, 1)

    def parse_town(self, response):
        """Parse a county page (``towntr`` rows)."""
        return self._parse_level(response, 2)

    def parse_village(self, response):
        """Parse a town page (``villagetr`` rows) and yield finished items."""
        return self._parse_level(response, 3)

    def parse_other(self, response):
        """Level-agnostic parser that works from each row's own class.

        Handles every ``<tr>`` whose class contains ``tr``.  Rows with a
        link in the first cell are followed with this same callback; rows
        without one are yielded.  A row whose class is not a known level
        passes the parent item through unchanged.
        """
        parent = response.meta.get("item") or {}
        prefixes = {row_class: prefix for row_class, prefix, _ in self._LEVELS}

        for row in response.xpath("//tr[contains(@class,'tr')]"):
            d = copy.deepcopy(parent)
            # Read the class per row so mixed tables are handled correctly.
            prefix = prefixes.get(row.xpath("@class").extract_first())
            url = row.xpath("td[1]/a/@href").extract_first()

            if url is None:
                if prefix == 'village':
                    d["village_id"] = row.xpath("td[1]/text()").extract_first()
                    d["village_name"] = row.xpath(
                        "td[3]/text()").extract_first()
                elif prefix is not None:
                    d[prefix + "_id"] = row.xpath(
                        "td[1]/text()").extract_first()
                    d[prefix + "_name"] = row.xpath(
                        "td[2]/text()").extract_first()
                yield d
            else:
                if prefix is not None and prefix != 'village':
                    d[prefix + "_id"] = row.xpath(
                        "td[1]/a/text()").extract_first()
                    d[prefix + "_name"] = row.xpath(
                        "td[2]/a/text()").extract_first()
                yield scrapy.Request(
                    response.urljoin(url),
                    callback=self.parse_other,
                    meta={"item": d})
